library(tidyverse)
library(rmarkdown)
library(lubridate)
library(ggrepel)
states <- map_data("state")
state_names <- states %>% group_by(region) %>% summarize(long=mean(long), lat=mean(lat))
acc <- read.csv("https://raw.githubusercontent.com/xdaiISU/ds202materials/master/hwlabs/fars2017/accident.csv", stringsAsFactors = FALSE)
ppl <- read.csv("https://raw.githubusercontent.com/xdaiISU/ds202materials/master/hwlabs/fars2017/person.csv", stringsAsFactors = FALSE)
glc <- readxl::read_xlsx("FRPP_GLC.xlsx")
# Preprocess GLC
glc$STATE = as.numeric(glc$`State Code`)
glc_states <- glc %>% group_by(STATE) %>% summarize(state_name=first(`State Name`))
acc_with_state_names <- acc %>% left_join(glc_states, by=c('STATE'))
acc %>% group_by(DAY_WEEK) %>%
summarize(number_accidents=n()) %>%
mutate(weekday=c('Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat')) %>%
select(-DAY_WEEK) %>%
subset(select=c(2,1))
## # A tibble: 7 x 2
## weekday number_accidents
## <chr> <int>
## 1 Sun 5360
## 2 Mon 4374
## 3 Tue 4347
## 4 Wed 4314
## 5 Thu 4621
## 6 Fri 5358
## 7 Sat 5873
Using the data manual, we know that values of DAY_WEEK 1-7 corresponds to days, Sun-Sat, which we can use to get the weekday label for each integer representation. In the resulting table, we can see that on Fridays, Saturdays, and Sundays, there are generally a significantly greater number of fatal accidents compared to Monday, Tuesday, Wednesday, or Thursday.
The greatest number of fatal accidents occur on Saturdays.
fatal <- ppl %>% filter(INJ_SEV==4)
head(fatal)
## STATE ST_CASE VE_FORMS VEH_NO PER_NO STR_VEH COUNTY DAY MONTH HOUR MINUTE
## 1 1 10001 1 1 1 0 73 19 2 23 35
## 2 1 10002 1 1 1 0 89 14 2 14 59
## 3 1 10003 3 2 1 0 101 31 1 20 31
## 4 1 10004 1 1 1 0 73 1 1 16 55
## 5 1 10005 1 1 1 0 13 1 1 20 0
## 6 1 10006 2 1 1 0 49 6 1 18 40
## RUR_URB FUNC_SYS HARM_EV MAN_COLL SCH_BUS MAKE MAK_MOD BODY_TYP MOD_YEAR
## 1 2 1 38 0 0 20 20421 15 2004
## 2 2 1 1 0 0 37 37402 14 2005
## 3 2 1 12 1 0 2 2404 14 2003
## 4 2 4 30 0 0 30 30046 4 2014
## 5 1 1 35 0 0 12 12471 34 2010
## 6 1 1 34 0 0 49 49032 4 1997
## TOW_VEH SPEC_USE EMER_USE ROLLOVER IMPACT1 FIRE_EXP AGE SEX PER_TYP INJ_SEV
## 1 1 0 0 0 12 0 42 1 1 4
## 2 0 0 0 9 0 0 43 1 1 4
## 3 0 0 0 0 6 0 47 1 1 4
## 4 0 0 0 1 11 0 18 1 1 4
## 5 0 0 0 1 12 0 67 2 1 4
## 6 0 0 0 1 11 0 18 2 1 4
## SEAT_POS REST_USE REST_MIS AIR_BAG EJECTION EJ_PATH EXTRICAT DRINKING ALC_DET
## 1 11 20 0 1 1 9 0 0 9
## 2 11 3 0 8 0 0 0 9 9
## 3 11 3 0 99 0 0 0 0 9
## 4 11 3 0 8 0 0 0 9 9
## 5 11 3 0 1 0 0 0 0 9
## 6 11 20 0 20 1 9 0 0 9
## ALC_STATUS ATST_TYP ALC_RES DRUGS DRUG_DET DSTATUS DRUGTST1 DRUGTST2 DRUGTST3
## 1 0 0 996 0 8 0 0 0 0
## 2 2 1 0 9 8 2 1 1 1
## 3 2 1 0 0 8 2 1 0 0
## 4 0 0 996 9 8 0 0 0 0
## 5 0 0 996 0 8 0 0 0 0
## 6 0 0 996 0 8 0 0 0 0
## DRUGRES1 DRUGRES2 DRUGRES3 HOSPITAL DOA DEATH_DA DEATH_MO DEATH_YR DEATH_HR
## 1 0 0 0 0 7 19 2 2017 23
## 2 605 600 996 0 7 14 2 2017 14
## 3 1 0 0 0 7 31 1 2017 20
## 4 0 0 0 0 7 1 1 2017 16
## 5 0 0 0 0 7 1 1 2017 20
## 6 0 0 0 0 7 6 1 2017 18
## DEATH_MN DEATH_TM LAG_HRS LAG_MINS P_SF1 P_SF2 P_SF3 WORK_INJ HISPANIC RACE
## 1 35 2335 0 0 0 0 0 0 7 1
## 2 59 1459 0 0 0 0 0 0 7 1
## 3 31 2031 0 0 0 0 0 0 7 2
## 4 55 1655 0 0 0 0 0 0 7 2
## 5 0 2000 0 0 0 0 0 0 7 1
## 6 40 1840 0 0 0 0 0 0 7 1
## LOCATION
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
Here we see a few entries of the dataset created which contains only the people who suffered fatal injuries.
# First get the MAKE for each accident
dat <- acc_with_state_names %>% inner_join(ppl, by=c('ST_CASE', 'STATE')) %>% select(STATE, state_name, MAKE)
most_dangerous <- dat %>%
filter(!is.na(MAKE)) %>%
group_by(STATE, state_name, MAKE) %>%
summarize(number_accidents=n()) %>%
filter(number_accidents==max(number_accidents))
## `summarise()` has grouped output by 'STATE', 'state_name'. You can override using the `.groups` argument.
most_dangerous %>% paged_table
Here we can see the most dangerous make ID for each state ID, as well as the number of fatal accidents for each of those state/make combinations.
states$region = toupper(states$region)
avg_state_locations <- states %>% group_by(region) %>% summarize(avg_long=mean(long), avg_lat=mean(lat))
dangerous_with_location <- most_dangerous %>% mutate(region=state_name) %>% inner_join(avg_state_locations, by=c('region'))
ggplot(states, aes(x=long, y=lat)) + geom_polygon(aes(group=group)) +
geom_text_repel(data=dangerous_with_location, aes(x=avg_long, y=avg_lat, label=MAKE), color='green')
Here we can see the most dangerous vehicle (most fatal accidents) make code for each state.
acc_ppl <- acc %>% inner_join(ppl)
## Joining, by = c("STATE", "ST_CASE", "VE_FORMS", "COUNTY", "DAY", "MONTH", "HOUR", "MINUTE", "RUR_URB", "FUNC_SYS", "HARM_EV", "MAN_COLL", "SCH_BUS")
paged_table(acc_ppl)